import random
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from torchtext.data import Field, BucketIterator, TabularDataset
import common

# Directory containing the dataset JSON files, located under whatever
# common.get_root_path() returns.  Note that the value ends with a slash.
save_path = f"{common.get_root_path()}/json/"
# Number embedded in the dataset file names that are loaded below
# (train_num=<num>.json and test_num=<num>.json).
num = 1000


# Lazily-created spaCy pipeline shared by every tokenize_eng() call.
_spacy_eng = None


def _get_spacy_eng():
    """Return the shared spaCy English pipeline, loading it on first use."""
    global _spacy_eng
    if _spacy_eng is None:
        _spacy_eng = spacy.load("en")
    return _spacy_eng


def tokenize_eng(text):
    """Split ``text`` into a list of token strings with spaCy's tokenizer.

    The spaCy model is loaded only once and then reused: this function is
    handed to the torchtext ``Field`` objects as their ``tokenize`` callable,
    so it is invoked for every example and reloading the model on each call
    would dominate preprocessing time.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list with the ``.text`` of every token produced by the tokenizer.
    """
    return [tok.text for tok in _get_spacy_eng().tokenizer(text)]


# The context and answer columns are preprocessed identically: tokenised
# with tokenize_eng, lower-cased, and mapped through a vocabulary.
_FIELD_OPTIONS = dict(
    sequential=True,
    use_vocab=True,
    tokenize=tokenize_eng,
    lower=True,
)
context = Field(**_FIELD_OPTIONS)
answer = Field(**_FIELD_OPTIONS)

# JSON key -> (attribute name on the loaded examples, Field that processes it)
fields = {"context": ("c", context), "answer": ("a", answer)}

# The train/test file names already contain the directory, so `path` is empty.
train_data, test_data = TabularDataset.splits(
    path="",
    train=f"{save_path}/train_num={num}.json",
    test=f"{save_path}/test_num={num}.json",
    format="json",
    fields=fields,
)

# Vocabularies are built from the training split only, capped at 10000
# entries and restricted to tokens that occur at least twice.
_VOCAB_LIMITS = dict(max_size=10000, min_freq=2)
context.build_vocab(train_data, **_VOCAB_LIMITS)
answer.build_vocab(train_data, **_VOCAB_LIMITS)


class Encoder(nn.Module):
    """Bidirectional LSTM encoder whose final state is projected for a
    unidirectional decoder.

    Tokens are embedded, passed through dropout and fed to a bidirectional
    LSTM.  The forward and backward final hidden (and cell) states are
    concatenated and mapped back down to ``hidden_size`` by a linear layer.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        """Build the encoder layers.

        Args:
            input_size: Number of rows in the embedding table.
            embedding_size: Size of each embedding vector.
            hidden_size: Hidden size of the LSTM (per direction).
            num_layers: Number of stacked LSTM layers.
            p: Dropout probability applied to the embeddings.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers,
            bidirectional=True,
        )

        # Each projection takes the concatenated forward + backward state
        # (2 * hidden_size) and reduces it to hidden_size.
        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
        self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(p)

    def forward(self, x):
        """Encode a batch of token indices.

        Args:
            x: Index tensor; the existing callers pass (seq_length, N) with
                N the batch size.

        Returns:
            A tuple ``(encoder_states, hidden, cell)``:
            ``encoder_states`` is the LSTM output for every position, of
            shape (seq_length, N, hidden_size * 2) because the LSTM is
            bidirectional; ``hidden`` and ``cell`` have shape
            (1, N, hidden_size).
        """
        embedded = self.dropout(self.embedding(x))
        # embedded: (seq_length, N, embedding_size)

        encoder_states, (final_h, final_c) = self.rnn(embedded)

        # Only the first two entries of the final state (the two directions
        # of the first layer) are used.  Slicing with [i:i+1] rather than [i]
        # keeps the leading dimension, so the result is already shaped like
        # the state of a single-layer, unidirectional LSTM.
        merged_h = torch.cat((final_h[0:1], final_h[1:2]), dim=2)
        merged_c = torch.cat((final_c[0:1], final_c[1:2]), dim=2)
        hidden = self.fc_hidden(merged_h)
        cell = self.fc_cell(merged_c)

        return encoder_states, hidden, cell


class Decoder(nn.Module):
    """One-step LSTM decoder that attends over the encoder states.

    At every call the current hidden state is compared with each encoder
    state to produce attention weights; the weighted sum of encoder states is
    concatenated with the embedded input token and fed to the LSTM, whose
    output is projected to ``output_size`` scores.
    """

    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        """Build the decoder layers.

        Args:
            input_size: Number of rows in the embedding table.
            embedding_size: Size of each embedding vector.
            hidden_size: Hidden size of the LSTM.
            output_size: Number of scores produced per step.
            num_layers: Number of stacked LSTM layers.
            p: Dropout probability applied to the embeddings.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        # The LSTM input is the context vector (hidden_size * 2, matching a
        # bidirectional encoder) concatenated with the token embedding.
        self.rnn = nn.LSTM(hidden_size * 2 + embedding_size, hidden_size, num_layers)

        # Scores one (decoder hidden, encoder state) pair:
        # hidden_size + hidden_size * 2 input features -> 1 scalar.
        self.energy = nn.Linear(hidden_size * 3, 1)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p)
        # Normalises over dim 0, i.e. across the source positions.
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()

    def forward(self, x, encoder_states, hidden, cell):
        """Run a single decoding step.

        Args:
            x: Token indices for the current step, shape (N,).
            encoder_states: Encoder outputs, (seq_length, N, hidden_size * 2).
            hidden: Current LSTM hidden state, (1, N, hidden_size).
            cell: Current LSTM cell state, (1, N, hidden_size).

        Returns:
            A tuple ``(predictions, hidden, cell)`` where ``predictions`` has
            shape (N, output_size) and ``hidden`` / ``cell`` are the updated
            LSTM state.
        """
        step_tokens = x.unsqueeze(0)
        # step_tokens: (1, N)

        embedded = self.dropout(self.embedding(step_tokens))
        # embedded: (1, N, embedding_size)

        # Copy the hidden state once per source position so it can be paired
        # with every encoder state.
        source_len = encoder_states.shape[0]
        tiled_hidden = hidden.repeat(source_len, 1, 1)
        # tiled_hidden: (seq_length, N, hidden_size)

        pairs = torch.cat((tiled_hidden, encoder_states), dim=2)
        scores = self.relu(self.energy(pairs))
        # scores: (seq_length, N, 1)

        weights = self.softmax(scores)
        # weights: (seq_length, N, 1), summing to 1 over seq_length

        # weights:        (seq_length, N, 1)               -> "snk"
        # encoder_states: (seq_length, N, hidden_size * 2) -> "snl"
        # context_vector: (1, N, hidden_size * 2)          -> "knl"
        context_vector = torch.einsum("snk,snl->knl", weights, encoder_states)

        lstm_input = torch.cat((context_vector, embedded), dim=2)
        # lstm_input: (1, N, hidden_size * 2 + embedding_size)

        outputs, (hidden, cell) = self.rnn(lstm_input, (hidden, cell))
        # outputs: (1, N, hidden_size)

        predictions = self.fc(outputs).squeeze(0)
        # predictions: (N, output_size)

        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder):
        """Store the two sub-networks.

        Args:
            encoder: Callable mapping ``source`` to
                ``(encoder_states, hidden, cell)``.
            decoder: Callable mapping
                ``(x, encoder_states, hidden, cell)`` to
                ``(output, hidden, cell)``.
        """
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Decode ``target`` conditioned on ``source``.

        Args:
            source: Source token indices, shape (source_len, N).
            target: Target token indices, shape (target_len, N); row 0 is
                used as the first decoder input.
            teacher_force_ratio: Probability, drawn once per time step, of
                feeding the ground-truth token instead of the decoder's own
                best guess as the next input.

        Returns:
            Tensor of shape (target_len, N, target_vocab_size) holding the
            decoder output for every step.  Row 0 is never written and stays
            zero because decoding starts at step 1.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]

        # Size the output buffer from the decoder's own projection layer so
        # the model does not depend on module-level vocabulary state.  The
        # global vocabulary is only consulted for decoders that do not expose
        # an ``fc`` linear layer, which keeps older call sites working.
        projection = getattr(self.decoder, "fc", None)
        if isinstance(projection, nn.Linear):
            target_vocab_size = projection.out_features
        else:
            target_vocab_size = len(answer.vocab)

        # Allocate directly on the device of the inputs rather than on a
        # module-level device, so the buffer always matches the batch.
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )
        encoder_states, hidden, cell = self.encoder(source)

        # First input will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # At every time step use encoder_states and update hidden, cell
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)

            # Store prediction for current time step
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability teacher_force_ratio feed the actual next word,
            # otherwise feed the decoder's own prediction.  Mixing the two
            # lets the model also see its own (possibly wrong) outputs during
            # training, which is all it gets at inference time.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs


### We're ready to define everything we need for training our Seq2Seq model ###
# Use the GPU when CUDA is available, otherwise the CPU.  The iterators, the
# networks and checkpoint loading below all follow this single choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = False  # when True, try to restore from my_checkpoint.pth.tar
save_model = True  # NOTE(review): not read in this section; presumably used by the training loop

# Training hyperparameters
num_epochs = 100
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
input_size_encoder = len(context.vocab)
input_size_decoder = len(answer.vocab)
output_size = len(answer.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
# NOTE: Encoder.forward always returns a state with a leading dimension of 1,
# which only matches a single-layer decoder LSTM, so keep this at 1.
num_layers = 1
enc_dropout = 0.0
dec_dropout = 0.0

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

# Use the configured batch size and device instead of repeating literals, so
# the script also runs on machines without CUDA.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=batch_size, device=device
)

encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Padding positions in the target do not contribute to the loss.
pad_idx = answer.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    try:
        # map_location lets a checkpoint saved on one device be restored on
        # whichever device was selected above.
        load_checkpoint(
            torch.load("my_checkpoint.pth.tar", map_location=device),
            model,
            optimizer,
        )
    except FileNotFoundError:
        # Only a missing checkpoint is treated as "start from scratch"; any
        # other failure (corrupt file, mismatched weights) is left to surface
        # instead of being silently ignored.
        print("can't find the load model, skip")

sentence = (
    "Observations and a Model of the Mean Circulation over the Middle Atlantic Bight Continental ShelfAnalyses of current time series longer than 200 days from 33 sites over the Middle Atlantic Bight continental shelf reveal a consistent mean circulation pattern. The mean depth-averaged flow is equatorward, alongshelf, and increases with increasing water depth from 3 cm s \u03ea1 at the 15-m isobath to 10 cm s\nat the 100-m isobath. The mean cross-shelf circulation exhibits a consistent cross-shelf and vertical structure. The near-surface flow is typically offshore (positive, range \u03ea3 to 6 cm s\n). The interior flow is onshore and remarkably constant (\u03ea0.2 to \u03ea1.4 cm s\n). The near-bottom flow increases linearly with increasing water depth from \u03ea1 cm s \u03ea1 (onshore) in shallow water to 4 cm s \u03ea1 (offshore) at the 250-m isobath over the slope, with the direction reversal near the 50-m isobath.\nA steady, two-dimensional model (no along-isobath variations in the flow) reproduces the main features of the observed circulation pattern. The depth-averaged alongshelf flow is primarily driven by an alongshelf pressure gradient (sea surface slope of 3.7 \u03eb 10 \u03ea8 increasing to the north) and an opposing mean wind stress that also drives the near-surface offshore flow. The alongshelf pressure gradient accounts for both the increase in the alongshelf flow with water depth and the geostrophic balance onshore flow in the interior. The increase in the near-bottom offshore flow with water depth is due to the change in the relative magnitude of the contributions from the geostrophic onshore flow that dominates in shallow water and the offshore flow driven by the bottom stress that dominates in deeper water. It has been 25 yr or more since the overviews of the mean circulation of the Middle Atlantic Bight (MAB) and Georges Bank continental shelves by Bumpus (1973) , Beardsley et al. (1976) , and Beardsley and Boicourt (1981) . 
In that time the number of currentmeter records of more than 6-month duration in the MAB has increased substantially as a result of several major field programs including the Nantucket Shoals Flux Experiment (NSFE) (Beardsley et al. 1985) , the Shelf Edge Exchange Processes (SEEP-I and SEEP-II) studies (Walsh et al. 1988; Aikman et al. 1988; Biscaye et al. 1994; Shaw et al. 1994) , the Coastal Mixing and Optics (CMO) study , the Minerals Management Service Hatteras Study off North Carolina (Churchill and Berger 1998) , and the Global Ocean Ecosystems Dynamics (GLOBEC) Georges Bank study (Werner et al. 2003; Irish et al. 2005) . Additional long-term current observations over the inner shelf have been made at the Martha's Vineyard Coastal Observatory (MVCO) (Austin et al. 2002) and the Long-term Ecosystem Observatory (LEO-15) off New Jersey (Schofield et al. 2002) . With the recent interest in climate and coastal observatories, it seems timely to review what is known about the MAB mean circulation and dynamics in the context of these more recent moored observations. A quote from Bumpus (1973) seems appropriate to the present situation:\nInasmuch as the literature about the [east coast of the United States] continental shelf is copious and scattered, it appears warranted to this author to review the information we now have to provide a firm basis for designing future research programs.\nCurrent time series of 200 days or longer over the MAB shelf were collected and analyzed to characterize the mean circulation pattern. The main features of the observed circulation pattern are shown to be quantitatively consistent with a simple dynamical model, similar to a model originally proposed by Csanady (1976) . A thorough and very interesting overview of what was known about the MAB mean circulation and dynamics prior to 1981 is presented by Beardsley and Boicourt (1981) . 
A briefer, updated overview is given here for completeness.\nThe MAB continental shelf extends from Cape Hatteras (North Carolina) in the south to Nantucket Shoals (south of Cape Cod, Massachusetts) in the northeast (Fig. 1) . The southern flank of Georges Bank is included as part of the MAB in this study because it is continuous with the MAB shelf to the west for water depths greater than 60 m. The southern flank of Georges Bank is separated from the Scotian shelf to the northeast by the 200-m-deep Northeast Channel and from the MAB to the southwest by the 60-m-deep Great South Channel. The Hudson shelf valley extends across almost the entire MAB shelf separating the New England shelf (between Great South Channel and Hudson shelf valley) from the central MAB (between Hudson shelf valley and Chesapeake Bay). The coastline and isobath orientations are aligned roughly northeastsouthwest on the southern flank of Georges Bank and in the central MAB, east-west on the New England shelf, and north-south in the southern MAB (between Chesapeake Bay and Cape Hatteras). The MAB shelf width decreases from about 100 km in the northern half of the MAB to less than 30 km at Cape Hatteras. The depth of the shelf break also decreases from \u03f3100 m in the north to 40 m or less in the southern MAB. The bottom slope (h x ) over the shelf varies, but is typically FIG. 1 . Map of the Middle Atlantic Bight showing locations of current time series longer than 200 days, mean depth-averaged current vectors (blue), and mean wind stress vectors (red). For clarity, only selected mean current vectors are shown for sites south of Cape Cod and on the southern flank of Georges Bank. The 50-, 100-, and 1000-m isobaths, and the approximate location of the Oleander line (Flagg et al. 2006) are also shown. 
6 \u03eb 10 \u03ea4 , except onshore of the 20-m isobath where the bottom is steeper by a factor of 5-10.\nMean wind stresses are \u03f30.03 N m \u03ea2 toward the southeast and spatially uniform (Fig. 1) . The mean wind stress is weak compared to the variability on time scales of days (standard deviations of \u03f70.1 N m \u03ea2 ) (e.g., Saunders 1977) . There is also a spatially uniform seasonal variation with monthly mean wind stresses of 0.07 N m \u03ea2 southeastward in winter (December and January) and 0.02 N m \u03ea2 northeastward in summer (June to August) .\nThe shelf waters of the MAB and Georges Bank exhibit a large seasonal variation in both temperature and stratification (e.g., Bigelow 1933; Mayer et al. 1979; Beardsley et al. 1985; Lentz et al. 2003; . In summer, the water is warm and thermally stratified due to strong surface heating and weak wind stresses. In winter, the water is cold and weakly stratified due to surface cooling and stronger wind stresses. Salinity over the shelf increases from about 32 near the coast to about 34 at the shelf break due to river discharges both within the MAB and to the north (Bigelow and Sears 1935; Chapman and Beardsley 1989; Loder et al. 1998; . A front, located near the shelf break, separates the cooler, fresher shelf water from the warmer, saltier slope water (e.g., Bigelow and Sears 1935; Linder and Gawarkiewicz 1998; Fratantoni and Pickart 2007) .\nRelative to other shelves, the MAB shelf is moderately wide, with a small bottom slope, a large seasonal variation in stratification, and it is strongly influenced by freshwater runoff and the presence of a shelf-slope front. The slope Burger number, defined as h x N\/f, is less than 0.1 for the MAB shelf, where N \u03f3 0 \u03ea2 s \u03ea1 is the buoyancy frequency and f \u03f3 10 \u03ea4 s \u03ea1 is the Coriolis frequency. 
This is small relative to, for example, continental shelves on the west coast of the United States and suggests the shelf response should be relatively barotropic.\nA sequence of studies using salinity and oxygen isotopic ratios (\u2426 18 O) as tracers have shown that the MAB shelf circulation is part of a continuous coastal current extending from Greenland to Cape Hatteras (Chapman et al. 1986; Chapman and Beardsley 1989; Loder et al. 1998; Khatiwala et al. 1999 ). The mean flow over the MAB shelf is southwestward along-isobath at 5-10 cm s \u03ea1 based on hydrography, drifters, shipboard current profiles, and moored current observations (e.g., Bigelow 1915; Bumpus 1973; Beardsley et al. 1976; Linder and Gawarkiewicz 1998; Lozier and Gawarkiewicz 2001; Shearman and Lentz 2003; Brink et al. 2003; Flagg and Dunn 2003; Flagg et al. 2006) . This mean flow turns offshore near Cape Hatteras and is entrained into the Gulf Stream (Bumpus 1973; Ford et al. 1952; Churchill and Berger 1998; Savidge and Bane 2001; Pietrafesa et al. 2002; Gawarkiewicz and Linder 2006) . The alongshelf mean flow increases with height above the bottom and distance offshore (Beardsley et al. 1976) .\nThe structure and magnitude of the mean cross-shelf circulation is less clear. Beardsley et al. (1976) note a tendency in moored current observations for onshore veering of the flow with depth. Bumpus (1965) inferred a divergence in the near-bottom cross-shelf flow with onshore flow shoreward of the 60-m isobath and offshore flow seaward of the 60-m isobath based on the small number of seabed drifters recovered that were deployed seaward of the 60-m isobath.\nThe dynamics of the mean circulation over the MAB shelf remain uncertain, in part because there have been few quantitative tests of dynamical models of the mean circulation. 
It has long been assumed that the MAB mean circulation is not driven solely by the local wind stress, which would tend to force an eastward flow on the New England shelf, opposite the observed mean circulation (Fig. 1) . Iselin (1939) argued that the MAB mean circulation is maintained by the cross-shelf buoyancy forcing associated with less dense (fresher) water near the coast. Subsequently, Sverdrup et al. (1942) argued that the southwestward flow was due to a poleward alongshelf pressure gradient estimated from geodetic leveling of tide gauges. However, subsequent studies suggested geodetic leveling was not accurate enough to determine the alongshelf pressure gradient (Sturges 1977) . Stommel and Leetmaa (1972) found that a steady, two-dimensional model (no alongshelf variations) forced by the mean wind stress and freshwater runoff could not account for the observed mean flow in the MAB and concluded there must be an alongshore sea surface slope y of order 10 \u03ea7 to drive the observed mean alongshelf flow equatorward. Scott and Csanady (1976) subsequently estimated y to be 1.4 \u03eb 10 \u03ea7 over the inner shelf off Long Island, New York, from the intercept of a linear regression analysis of the form sy \/ o \u03ed r b \u03e9 gh y , where sy is the alongshelf wind stress, o is the seawater density, r is a linear drag, b is the near-bottom alongshelf velocity, g is gravitational acceleration, and h is the water depth. The source of the inferred alongshelf pressure gradient has not been determined. Along-isobath buoyancy gradients associated with freshwater runoff extending as far north as the Arctic, the large-scale offshore circulation, or an upstream source have been suggested as possible causes (e.g., Beardsley and Winant 1979; Chapman et al. 1986 ). On the basis of model results, Beardsley and Winant (1979) argue that the alongshelf pressure gradient is imposed by the larger-scale circulation (Csan-ady 1978) . 
However, subsequent studies using simple barotropic models with bottom friction suggested that an alongshelf pressure gradient associated with the large-scale general circulation could not drive the shelfmean flow because the pressure gradient did not penetrate onto the shelf (Wang 1982; Chapman et al. 1986 ). The source of the inferred alongshelf pressure gradient is discussed further in section 6a.\nIn a particularly insightful study, Csanady (1976) used a steady, two-dimensional, constant eddy-viscosity model over a sloping bottom, similar to the model of Stommel and Leetmaa (1972) , to examine the dynamics of the MAB mean circulation. The model was forced by alongshelf and cross-shelf wind stress, a cross-shelf density gradient (buoyancy force), and an alongshelf pressure gradient. Csanady pointed out that for a spatially uniform alongshelf pressure gradient, the equatorward alongshelf mean flow should increase with water depth as observed. He also noted that the alongshelf pressure gradient accounts for the change in sign of the nearbottom cross-shelf flow suggested by the bottom drifter observations (Bumpus 1965) . Modified versions of Csanady's model with imposed alongshelf pressure gradients reproduce the main qualitative features of the mean circulation over both the southwest Nova Scotia shelf (Smith 1983 ) and the southern flank of Georges Bank (Butman et al. 1987 ). Characteristics of the observed mean circulation (described in sections 5a and 5b) are compared to a modified version of Csanady's model developed in this section. The model developed here avoids choosing an eddy viscosity by assuming constant surface and bottom boundary layer thicknesses and decomposing the flow into geostrophic and stress-driven boundary layer (Ekman) components ( Fig. 2; Dever 1997) . A right-handed coordinate system is used with y aligned alongshelf positive poleward, x positive offshore, and z positive upward. 
The mean alongshelf and cross-shelf currents vary in the cross-shelf direction, but do not exhibit any systematic variation along isobaths, except in the vicinity of Cape Hatteras (see sections 5a and 5b). This result indicates that a reasonable first approximation is to assume there are no alongshelf variations in the mean flow. With this assumption, assuming K h, and the boundary condition of no cross-shelf transport U at the coast, volume conservation implies\neverywhere, where u is the mean cross-shelf velocity and superscript \"da\" indicates the depth-average flow. The momentum balances are assumed to be steady and linear; that is, the nonlinear advective terms are assumed to be small. The alongshelf momentum balance,\nconsists of three terms: the Coriolis force fu, the alongshelf pressure gradient P y , and the alongshelf stress divergence y z . Subscripts (x, y, z) indicate partial derivatives. The cross-shelf momentum balance is assumed to be geostrophic,\nwhere is the alongshelf velocity. Previous observational studies indicate that the alongshelf velocity is primarily geostrophic at subtidal time scales (Brown et al. 1985; Lentz et al. 1999; Shearman and Lentz 2003) , assuming, however, a geostrophic balance neglects cross-shelf wind stress and wave forcing that may be important in shallow water Fewings et al. 2008; Lentz et al. 2008) . A derivation including  The cross-shelf circulation is assumed to consist of cross-shelf Ekman transports in the surface and bottom boundary layers and a geostrophic cross-shelf flow extending from the surface to the bottom. Surface and bottom Ekman velocities, vertically averaged over the boundary layers, are\nwhere by is the alongshelf component of the bottom stress, and \u2426 s and \u2426 b are surface and bottom boundary layer thicknesses. Assuming the flow is hydrostatic, P z \u03ed \u03eag, the geostrophic cross-shelf velocity is\nis the alongshelf buoyancy gradient and is density. 
From the alongshelf momentum balance (2), the steady cross-shelf current is geostrophic in the interior where\nIntegrating (2) from the surface to the bottom and using (1) and (5) yields\nNote that dividing (6) by f and using (4) indicates that the sum of the geostrophic and Ekman cross-shelf transports is zero,\nas it must be from (1). Offshore of the inner shelf, where the surface and bottom boundary layers do not overlap (h \u03fe \u2426 s \u03e9 \u2426 b ), the vertical average of the cross-shelf velocity in the surface boundary layer is the sum of the Ekman (4) and geostrophic velocities (5), Similarly, the vertical average of the cross-shelf velocity in the bottom boundary layer is\nwhere (6) was used to express by in terms of sy , y , and B y . To estimate the geostrophic alongshelf velocity, assume the flow is hydrostatic, which implies a thermal wind balance:\nSeveral previous observational studies support a thermal wind balance, even over the inner shelf Shearman and Lentz 2003; Garvine 2004; Codiga 2005) . Integrating the thermal wind balance (9) from the bottom to z yields an expression for the geostrophic alongshelf flow:  To simplify (12), assume constant cross-shelf and vertical density gradients in the interior, x \u03ed i x and z \u03ed i z , and a vertically well-mixed bottom boundary layer\nz for a constant thickness bottom boundary layer over a bottom with cross-shelf slope h x (defined as positive), and assuming y is constant, (12) reduces to da \u03ed\nwhere N 2 \u03ed \u03eag i z \/ o is the interior buoyancy frequency squared. The depth-averaged alongshelf flow is forced by the wind stress, alongshelf pressure gradient, and buoyancy gradients. The last term on the right-hand side is associated with buoyancy shutdown in the bottom boundary layer, the tendency for mixing and advection in the bottom boundary layer to establish crossisobath density gradients that reduce the near-bottom velocity and hence the bottom stress. 
The buoyancy shutdown term depends on the slope Burger number h x N\/f as expected from previous theory (e.g., Trowbridge and Lentz 1991) .\nThe contributions to da in (14) exhibit different dependencies on h. The alongshelf wind stress term is independent of h, as is the buoyancy shutdown term over the mid-and outer shelf where h k \u2426 b . The alongshelf sea surface slope and interior cross-shelf density gradient terms increase linearly with increasing h. The alongshelf density gradient term depends on h 2 , indicating it is larger over the outer shelf.\nThe mean circulation in this model, given by Eqs. (5), (7), (8), and (12) or (14), is forced by the mean alongshelf wind stress, alongshelf sea surface slope, and buoyancy (density) gradients. A more complete model would solve for the density field and the alongshelf sea surface slope given atmospheric forcing, river runoff, and offshore forcing associated with the open ocean. Thirty-three MAB current-meter records longer than 200 days were found (Table 1 ). The spatial coverage is sparse and the spatial distribution is not uniform (Fig.  1 ). There are a few sites at midshelf on the southern flank of Georges Bank, a relatively large number of sites on the New England shelf, and only a few sites in the central and southern MAB. Details about the current-meter observations from each site can be found in the associated references (Table 1) . Statistics for four of the sites are from Beardsley and Boicourt (1981) , Butman (1987) , and Biscaye et al. (1994) . For the other sites, the current time series were obtained and analyzed.\n\"Mean\" currents for each site were estimated as time averages over the duration of available data. The choice of 200 days is based on the desire to obtain mean current estimates with a standard error of the mean of 1 cm s \u03ea1 or less. The standard error of the mean, defined as std \/\u034cn, is a measure of how accurately an estimate represents the true mean current. 
Here std is the standard deviation of the detided current variability and n is the number of independent samples in the record. Standard deviations of detided current variability in the MAB are typically about 10 cm s \u03ea1 and decorrelation time scales are about 2 days, so record lengths of 200 days or longer are required to reduce the standard error to 1 cm s . This uncertainty does not account for potential bias errors in the current measurements due, for example, to improper averaging of surface gravity waves (e.g., Beardsley et al. 1981; . The uncertainty estimate also assumes there is not a large annual cycle in currents relative to the variability on time scales of days . Mean current estimates with or without removing an annual cycle are similar for the time series examined.\nDepth-averaged flows are estimated using trapezoidal integration and assuming the flow is vertically uniform near the boundaries to extrapolate to the surface and bottom. Results are similar if the velocity profile is extrapolated linearly to the surface and bottom. Vertical coverage varies substantially. Acoustic Doppler current profilers (ADCPs) have sample bins every meter or less over about 80% of the water column, while there are some moorings with as few as two current meters (Table 1) . Depth-average alongshelf velocities at a few sites were approximated with middepth currents because of poor data return from near-surface current meters. Depth-average alongshelf velocities at the two SEEP-I sites (Aikman et al. 1988 ) are approximated by 10-m currents since these outer-shelf and upperslope sites had only near-surface and near-bottom current meters.\nA coordinate system is adopted with y aligned with the depth-averaged mean flow da , but positive poleward, and x positive offshore. In this coordinate frame the depth-average mean cross-shelf velocity u da is zero by definition. 
This coordinate frame was chosen because it is consistent with the model assumptions in section 3, and it yielded more consistent mean crossshelf flow profiles than a coordinate frame aligned with the principal axes of the subtidal depth-averaged flow.\nThe sparse vertical coverage at many of the sites results in uncertainty in determining the orientation of the mean depth-average flow. The mean depth-average flow da is oriented roughly along-isobath ( Fig. 1) , but is not always aligned with the principal axes of the subtidal depth-average flow. In the northern MAB, da is oriented 0\u00b0-20\u00b0counterclockwise relative to the major axis orientations of the subtidal flow. The characteristics of the cross-shelf flow u are sensitive to the choice of coordinate frames, while the characteristics of the alongshelf flow are not sensitive to the choice of coordinate frames.\nBuoy wind observations are available at a few of the mooring locations. At current-meter sites without wind observations, the closest NDBC buoy or coastal wind observations are used (Fig. 1) . In a few cases, more than one buoy wind time series is used to get a complete wind record at the site. Wind stresses are estimated from wind velocities and sensor heights using a neutral bulk formula (Large and Pond 1981) .\nBottom stresses are estimated using unfiltered (hourly) near-bottom velocities and a quadratic drag law of the form\nThe magnitude and dependence of the drag coefficient C D on height above the bottom z\u0408 \u03ed h \u03ea z is determined empirically from near-bed covariance stress observations over Georges Bank (see appendix B). From the Georges Bank estimates, C D \u03f7 0.001 at 5 m above the bottom. 
Comparisons with covariance stress estimates from two other sites suggest an uncertainty of about 50% (appendix B).\nMean cross-shelf and along-isobath density gradients are estimated using density profiles from the National Oceanographic Data Center's World Ocean Database 2001 archive of ship observations. The observations were quality controlled and water depths were deter- * Mean alongshelf current at middepth rather than depth average. ** Mean alongshelf current at 10 m rather than depth average. mined using the National Geophysical Data Center high-resolution bathymetry for the region . A total of 20 158 profiles over the shelf (water depth \u0545100 m) were extracted, excluding profiles in Chesapeake Bay, Delaware Bay, Long Island Sound, Buzzards Bay, and Nantucket Sound. Each shelf profile was interpolated onto a 5-m vertical grid. The mean density gradients along the 30-, 50-, 70-, and 90-m isobaths are estimated by finding all profiles in a 20-m depth band around a given isobath, for example, between the 60-and 80-m isobaths for estimates along the 70-m isobath. An average annual cycle is removed to minimize biases associated with uneven seasonal sampling. Mean along-isobath trends are estimated at each 5-m depth interval. The mean depth- , about the same size as the 95% confidence bounds for the trend estimate. At the surface there is a slightly larger y of the opposite sign.\nTo estimate the mean cross-shelf density gradient x , the minimum distance to the 100-m isobath is determined for each density profile, an average annual cycle is removed, and the cross-shelf trend is estimated for all profiles between the 20-and 100-m isobaths. The estimated mean cross-shelf density gradient is x \u03f7 4 \u03eb 10\n, which is significantly different from zero at the 95% confidence level. Estimates are slightly higher near the surface and bottom [5 \u03ea 6 \u03eb 10 \u03ea6 (kg m ]. 
Estimates of x are essentially the same for different subsets of the data, that is, the New England shelf or southern and central MAB.\n), indicating da is not primarily driven by the local wind stress (see section 5c).\nThe alongshelf velocity (z) increases with height above the bottom except near the surface (Fig. 4) . The profiles tend to be linear over the inner shelf and curved over the mid-to outer shelf with stronger vertical shear in the lower half of the water column. Vertical shears are 2-4 \u03eb 10 \u03ea3 s \u03ea1 near the bottom at all water depths, decrease with height, and are near zero more than 50 m above the bottom (Fig. 5) .\n) near the surface, maximum onshore flow of \u03f30.5 cm s \u03ea1 at middepth and generally weak crossshelf flow near the bottom (Fig. 6b) . Over the outer shelf, there is offshore flow (0-4 cm s\n) near the surface, maximum onshore flow of 0.5-1 cm s \u03ea1 at middepth, and offshore flow of 1 cm s \u03ea1 near the bottom (Fig. 6c) .\nNear-surface cross-shelf flows (u s ) are variable with a tendency for stronger offshore flows in the northern MAB and weaker cross-shelf flows in the southern MAB (\u1b7a in Fig. 7) . The maximum onshore \"interior\" flow (u i ) at each site (\u2022 in Fig. 7 ) is remarkably consistent, ranging between \u03ea0.2 and \u03ea1.4 cm s\n, except at one of the southern sites where u i \u03f7 0.2 cm s \u03ea1 (alongshelf distance \u03ea1300 km in Fig. 7) . The onshore interior flow compensates for the near-surface and near-bottom (discussed below) offshore flows and is also consistent with an alongshelf pressure gradient (higher pressure to the north) and a geostrophic interior as discussed below. 
Neither the near-surface flow nor the maximum onshore flow exhibit any obvious dependence on water depth (not shown).\nIn contrast, near-bottom cross-isobath flows (u b ) tend to increase linearly with water depth from \u03ea1 cm s \u03ea1 (onshore) in shallow water to \u03f34 cm s \u03ea1 (offshore) at the 250-m isobath (Fig. 8) . The correlation between h and u b is 0.78 and the regression slope is 0.018 \u03ee 0.004 cm s \u03ea1 m \u03ea1 . Farther seaward over the slope, the near-bottom offshore flow decreases to 1-2 cm s\n. Figure 8 includes observations summarized in two previous studies of near-bottom flow over the MAB shelf and slope (Butman 1987; Csanady et al. 1988 ). The observations of u b support Bumpus' (1965) conclusion from bottom drifter returns that the nearbottom flow is onshore shoreward of the 60-m isobath and offshore seaward of the 60-m isobath. Mean nearbottom cross-shelf flows near the mouth of Delaware Bay are also onshore at about 1 cm s \u03ea1 (Pape and Garvine 1982; Garvine 1991) . The moored currents indicate that u b crosses zero at about the 50-m isobath. The observed cross-shelf circulation pattern is summarized in Fig. 2 . Over the mid-and outer shelf there is an offshore flow near the surface, onshore flow in the interior, and offshore flow near the bottom. Over the inner shelf, there is offshore flow near the surface and onshore flow near the bottom. The near-bottom flow increases linearly with distance offshore from the inner shelf to the upper slope. While the mean cross-shelf velocities (Figs. 6, 7, and 8) are similar in magnitude to the accuracy of the current measurements, the consistency of the circulation pattern suggests they are real.\n, f \u03ed 0.9 \u03eb 10 \u03ea4 s \u03ea1 , and h x \u03ed 6 \u03eb 10\n. The \"interior\" mean cross-shelf flow is onshore at about 1 cm s \u03ea1 and remarkably constant, both spatially and between different field programs (Fig. 7) . 
The near-bottom mean flow increases with water depth from \u03ea1 cm s \u03ea1 (onshore) near the coast to 4 cm s \u03ea1 (offshore) over the slope (Fig. 8) . The change in sign of the near-bottom cross-shelf flow occurs at about the 50-m isobath, consistent with Bumpus' results based on bottom drifter returns. The cross-shelf profiles have a two-layer structure over the inner shelf with offshore flow near the surface and onshore flow near the bottom and a three-layer structure over the mid-and outer shelf with offshore flow near the surface and bottom, and onshore flow in the interior (Fig. 6) . Assuming no alongshelf variations in the circulation, the convergence of the interior onshore transport due to the decreasing water depth feeds the divergence of the offshore flow in the bottom boundary layer as shown schematically in Fig. 2 . There is also upwelling over the inner shelf where the near-bottom onshore flow feeds the offshore transport in the surface boundary layer.\nThe observed circulation is consistent with a steady, two-dimensional dynamical model (developed in section 3) similar to a model proposed by Csanady (1976) . The cross-shelf momentum balance is assumed to be geostrophic. The mean equatorward alongshelf flow is driven by a large-scale, alongshelf pressure gradient (or some other spatially uniform body force), a cross-shelf buoyancy forcing associated primarily with the crossshelf salinity gradient, and an alongshelf wind stress. Thus, in the depth-averaged alongshelf momentum balance, the alongshelf pressure gradient (or other body force) is balanced by the wind stress and bottom stress.\nEstimates of the near-surface and near-bottom crossshelf transports are in rough agreement with Ekman transports associated with the wind stress and bottom stress, respectively (Fig. 9) . Measurements of the mean alongshelf pressure gradient P y do not exist. 
However, independent estimates of P y based on the depthaveraged alongshelf momentum balance and assuming the interior onshore flow is geostrophic are significantly correlated, though the geostrophic estimates are about 1.6 times larger than the estimates from the depthaveraged momentum balance. Both estimates indicate that the mean alongshelf pressure gradient decreases to near zero in the region south of Chesapeake Bay. North of Chesapeake Bay, the mean alongshelf sea surface slope is estimated to be 3.7 \u03eb 10 \u03ea8 based on the depthaverage momentum balance. The associated alongshelf pressure gradient contribution to the depth-averaged alongshelf flow is about 8 times larger than the crossshelf buoyancy force contribution estimated from historical hydrographic data. The model reproduces the observed magnitude and linear increase with water depth in both the depth-averaged alongshelf flow over the mid-and outer shelf (Fig. 3) and in the nearbottom, cross-shelf flow, including the reversal to onshore flow shoreward of the 50-m isobath (Fig. 8) . The increase in the near-bottom offshore flow is a result of the changing relative magnitudes of the geostrophic interior onshore flow and the offshore flow driven by the bottom stress. Acknowledgments. The author is grateful to various researchers (too numerous to list) that collected the historical data used here. The remarkably consistent picture of the mean circulation that emerged from this study is a testament to the care and effort that went into collecting these observations. Collection of the observations was funded by Department of Energy, Minerals Management Service, National Science Foundation, National Oceanic and Atmospheric Administration, and the Office of Naval Research. John Trowbridge graciously provided covariance estimates of bottom stress essential to this study. 
The author also appreciates comments and suggestions on an early draft of this manuscript by Bob Beardsley, Ken Brink, and Melanie Fewings. The author is especially grateful to Jamie Pringle for pointing out several typographical errors in an early version of the model description and for motivating a more complete and clearer derivation of the model. This research was funded by Ocean Sciences Division of the National Science Foundation under Grants OCE-820773, OCE-841292, and OCE-848961.\nfor both components, with 95% confidence intervals of \u03ee0.5 \u03eb 10 \u03ea4 m s \u03ea1 for the cross-shelf component and \u03ee0.6 \u03eb 10 \u03ea4 m s \u03ea1 for the alongshelf component.   Consistent with previous studies (e.g., Beardsley et al. 1976; Pietrafesa et al. 2002) , depth-average mean currents at all sites over the MAB shelf are equatorward and approximately along-isobath ( Fig. 1; Table 1 ). The exception to this is the flow within Hudson shelf valley, the only canyon that extends across the entire MAB shelf. Hudson shelf valley has a dramatic influence on the local circulation, with mean along-valley flows that are onshore at 5-10 cm s \u03ea1 (Nelsen et al. 1978; Mayer 1982; Manning et al. 1994; Harris et al. 2003) . The subsequent analysis does not include the current observations from the two sites within Hudson shelf valley.\nDepth-average mean alongshelf currents da increase with increasing water depth h over the shelf (h \u0545 120) from \u03ea3 cm s \u03ea1 in 15-m water depth to \u03ea10 cm s \u03ea1 in 100-m water depth (Fig. 3) . The correlation between h and da over the shelf is \u03ea0.83 (significantly different from zero at the 99% confidence level), with a regression slope of \u03ea0.07 \u03ee 0.02 cm s \u03ea1 m \u03ea1 and an intercept of \u03ea1.8 \u03ee 1.2 cm s \u03ea1 . 
The most notable outlier over the shelf is from a site near the coast (water depth 20 m), south of Chesapeake Bay, where there is an enhanced southward flow that is probably associated with the buoyant coastal current from Chesapeake Bay (Rennie et al. 1999) . Observations of da from repeated shipboard ADCP transects along the Oleander line (Flagg et al. 2006) show more scatter than the moored observations in the depth range from 65 to 75 m because the Oleander line passes over Hudson shelf valley in this depth range (Fig. 1) . The consistency of the relationship between h and da over the MAB shelf suggests the mean currents do not vary substantially along isobath between Georges Bank and Cape Hatteras, though the spatial coverage is sparse (Fig. 1) . The consistency of the relationship also suggests interannual variations over the last few decades are small, typically 1-2 cm s \u03ea1 or less, since the means span different time periods.\nIt is interesting that the shelfbreak jet does not stand out in the mean depth-averaged flow. In particular, da from two sites near the shelf break in the mid-and southern MAB (Fig. 1) , where the shelf break is shallower, exhibit the same dependence on h. Over the upper slope, between the 100-and 150-m isobaths, core rings (Fratantoni and Pickart 2003; Flagg et al. 2006) .\nMean wind stresses are generally southeastward (offshore), opposing the mean flow in the northern MAB, perpendicular to the mean flow in the central MAB and the southern flank of Georges Bank, and partially in the direction of the mean flow in the southern MAB (Fig.  1) . The depth-average mean alongshelf currents in the absence of wind forcing were estimated as the intercepts of a linear regression between the wind stress and da for the temporal lag and wind stress orientation that yielded the maximum correlation between s and da . 
The intercept estimates of the depth-average mean alongshelf currents in the absence of wind forcing (not shown) are similar to da and exhibit the same dependence on water depth (correlation \u03ea0.80 and regression slope \u03ea0.08 \u03ee 0.02 cm s \u03ea1 m In this coordinate frame, where u da \u03ed 0, there is a consistent vertical and cross-shelf structure to the mean cross-shelf flow u(z) (Fig. 6) . Over the inner shelf (water depths 10-15 m), there is a two-layer structure with offshore flow of 1 cm s \u03ea1 near the surface and onshore flow of 1 cm s \u03ea1 near the bottom (Fig. 6a ) (see also Codiga 2005) . Over the midshelf, there is offshore flow (1-3 cm s The model developed in section 3 provides estimates of da from [(12) or (14)], u s from (7), u b from (8), and\n, r, and the forcing terms: the wind stress, the alongshelf sea surface slope, and the buoyancy (density) gradients. In this section, model estimates are compared to the observed mean flow.\nFirst, the proposed mean Ekman balances (4) are evaluated by comparing the alongshelf surface (bottom) Ekman transport estimates based on the surface (bottom) stress at each site to rough estimates of the near-surface (near bottom) cross-shelf transport deficit\n] at each site. Assuming constant boundary layer thicknesses \u2426 s \u03ed \u2426 b \u03ed 15 m based on the observed mean cross-shelf current profiles (Fig. 6) , there is reasonable agreement between the two terms in the Ekman balances for both the surface and bottom boundary layers (Fig. 9) . For the surface layer, the correlation is 0.52 and the regression slope is 0.9 \u03ee 0.7. For the bottom layer, the correlation is 0.37. However, if the two sites south of Chesapeake Bay are excluded (two of the points below dashed line in Fig. 9b ), the correlation is 0.66 and the regression slope is 1.4 \u03ee 0.8. 
The two sites south of Chesapeake Bay have only three current meters spanning the water column and do not exhibit the same vertical structure seen at the other sites. Between Chesapeake Bay and Cape Hatteras there is also a tendency for the mean flow to turn offshore (Pietrafesa et al. 2002) , which may influence these relationships. This evaluation is crude and is only intended to show there is a rough relationship between Ekman and observed transports. A more careful analysis would integrate currents over a variable boundary layer thickness, but the necessary observations are not available at most of the sites. . Reliable direct estimates of the mean alongshelf pressure gradient do not exist. However, Eqs. (5) and (6) provide two independent estimates of P y \u03ed o g y (since y \u03ed 0 and therefore B y \u03ed 0). The two estimates of P y are correlated (Fig. 10) except for the three sites in less than 20 m of water where one does not expect a geostrophic interior (Fig. 6a) discrepancy could be due to a bias in the bulk wind stress estimates or bias errors in the mean interior cross-shelf currents. The smaller alongshelf pressure gradients estimated from the surface and bottom stresses are more consistent with the observed depthaveraged alongshelf velocity and the near-bottom cross-shelf velocity than pressure gradient estimates based on the interior cross-shelf currents (see below). This suggests the cross-shelf interior currents are overestimated, possibly due to wave-bias errors (Beardsley et al. 1981; . The pressure gradient estimates decrease toward zero in the region south of Chesapeake Bay (evident from u i in Fig. 7) , as suggested by Bush and Kupferman (1980) . This is qualitatively consistent with the tendency for the shelf flow to turn offshore between Chesapeake Bay and Cape Hatteras (Pietrafesa et al. 2002; Gawarkiewicz and Linder 2006 Scott and Csanady (1976) from the inner shelf south of Long Island. 
The larger inferred pressure gradient may be associated with the coastal geometry since both sites are located offshore of islands, or there may be an enhanced alongshelf pressure gradient over the inner shelf due, for example, to buoyant gravity currents (e.g., Rennie et al. 1999) .\nAccurate measurement of the implied alongshelf pressure gradient is challenging, as it requires measuring the sea surface slope relative to the geoid with an accuracy of better than 1 cm over 1000 km. Additionally, one must separate the mean alongshelf pressure gradient y from the larger mean cross-shelf pressure gradient x . In the following, a mean sea surface slope of y \u03ed 3.7 \u03eb 10 \u03ea8 is assumed based on the stress estimates and the depth-averaged alongshelf momentum balance (6).\nModel estimates of u b from (8) (Fig. 8, dashed line) . In particular, the model predicts a linear increase in u b with increasing water depth h, onshore near-bottom flow shoreward of the 50-m isobath, and approximately the correct magnitude for the near-bottom flow out to the 250-m isobath. The agreement between the observed and modeled u b provides further support for the magnitude of the inferred alongshelf pressure gradient, as previously noted by Csanady (1976) , and suggests P y is roughly constant out to the 250-m isobath. The model also provides a simple dynamical explanation for the reversal in u b at about the 50-m isobath. If P y does not vary across the shelf, as assumed, then from (5) the onshore interior velocity u g is also constant. However, from (6), by must increase as the water depth increases to balance P y h (assuming sy is approximately constant) because the pressure gradient is a body force. This implies the offshore Ekman \nwhich for the model inputs used yields h br \u03ed 55 m. Thus, onshore of the 55-m isobath u be \u03fd \u03eau g , while offshore of the 55-m isobath u be \u03fe \u03eau g . 
Model estimates of da from (12) are also in reasonable agreement with the observations for water depths between the 50-and 100-m isobaths (Fig. 3) . Over the inner shelf, where the model is not valid because h \u0545 \u2426 s \u03e9 \u2426 b , model estimates of da (dashed line in Fig. 3 ) are closer to zero than the observed flow. Stronger cross-shelf and alongshelf buoyancy forcing associated with local river discharges (e.g., Munchow and Garvine 1993; Rennie et al. 1999; Munchow and Chant 2000; Ullman and Codiga 2004) , the steeper bottom slope over the inner shelf, the influence of cross-shelf winds ) and\/or surface waves , and weaker mean alongshelf wind stresses near the coast (Saunders 1977 ; Fig. 1 ) may all influence the inner-shelf response.\nThe contributions of the four forcing terms to da in (14) indicate that the alongshelf pressure gradient dominates the response and primarily accounts for the linear increase of da with h over the mid-and outer shelf (Fig. 11) . The alongshelf wind stress drives an opposing mean flow. The cross-shelf buoyancy force contribution from the interior cross-shelf density gradient is about a factor-of-8 smaller than the alongshelf pressure gradient term, although as noted above, the cross-shelf buoyancy force may be more important over the inner shelf. The last term in (14), associated with buoyancy shutdown in the bottom boundary layer (dotted line in Fig. 11) , is approximately constant over the mid-and outer shelf and also relatively small compared to the pressure gradient, consistent with the small slope Burger number (h x N\/f \u03ed 0.06) characterizing the MAB mid-and outer shelf. While the observations examined here provide a remarkably consistent picture of the MAB mean circulation, several limitations in both the observations and model are evident. The spatial coverage of the current observations is sparse and uneven. 
As a result, the mean circulation picture may be biased toward conditions on the New England shelf. Direct measurements of the mean alongshelf pressure gradient are a key missing element in testing the proposed model. However, given the estimated size of the sea surface slope, the difficulty of making accurate mean pressure measurements, and the uncertainty in the reference geoid, obtaining an accurate estimate of the mean pressure gradient seems problematic. Another large uncertainty in the present analysis is bottom stress. More covariance estimates of near-bottom stress are needed. Existing estimates (appendix B) are inconsistent with log profile estimates, even relatively close to the bottom, suggesting this may not be an effective way to estimate the bottom stress over the MAB shelf.\nThe model developed in section 3 is incomplete in the sense that both the alongshelf pressure gradient and the cross-shelf density gradient are prescribed as forcings, although they are clearly part of the larger-scale response (e.g., Beardsley and Winant 1979) . Numerical models with more complete physics may provide the needed insight into the dynamics associated with establishment of both the alongshelf pressure gradient and the cross-shelf density structure. Brief discussions of these two terms are given below, followed by estimates of the alongshelf transports. As noted in section 2, the alongshelf pressure gradient has been attributed to either the alongshelf buoyancy forcing associated with coastal runoff or the larger-scale ocean circulation. The mean depthaveraged along-isobath density gradient over the MAB shelf determined from the archived hydrographic profiles is zero to the accuracy of the estimate and does not drive the mean alongshelf flow, in support of Stommel and Leetmaa's (1972) results. The agreement between the observations and model estimates of da and u b also suggest an alongshelf pressure gradient that does not vary across the shelf. 
These results are consistent with model results that indicate a pressure gradient imposed by the large-scale circulation would not vary across the shelf (Csanady 1978; Beardsley and Winant 1979) , but it remains unclear how such a pressure gradient would penetrate onto the shelf (Wang 1982; Chapman et al. 1986 ). The observed flow may also be the result of an upstream volume transport onto the shelf associated with flow from the Arctic and Hudson Strait that no longer has a significant alongshelf buoyancy gradient.\nThough an alongshelf pressure gradient forcing has been assumed here and in previous studies, the observations are consistent with any body force that is spatially uniform and has the magnitude of the estimated pressure gradient. In particular, the observed mean flows could be forced by eddy variability as characterized in studies using statistical mechanics to parameter- Haidvogel and Brink 1986; Samelson and Allen 1987) . Both these models predict mean flows in the same direction as observed. However, at present it is difficult to evaluate these models using oceanic observations because the models do not provide characterizations of the mean flow in terms of clearly defined observable variables and generally do not consider forced-dissipative systems or stratification. For variable flow over rough bathymetry assuming topographic bumps with a height of 5 m, estimates of the mean flow magnitude from both the theory [Eq. (5.1) in Samelson and Allen 1987] and the numerical model results (Haidvogel and Brink 1986) are about an order of magnitude smaller than the observed mean alongshelf flow. A simple estimate proposed by Merryfield et al. (2001) for an eddy forced, barotropic mean flow is\nwhere L is an eddy length scale. If L is constant, then is inversely proportional to h, which is not what is observed in Fig. 3 . However, assuming L \u03f7 Nh\/f is proportional to the baroclinic deformation radius yields\nfor N \u03ed 10 \u03ea2 s . 
This is remarkably close to the observed relationship, suggesting eddy variability is a plausible forcing mechanism that warrants further attention.\nh from the linear regression analysis in section 5a (mks units). For a linear bottom slope, the volume transport is proportional to h 3 , which means the transport estimate is sensitive to the choice of x sb . Volume transports were estimated by numerically integrating (18) for six cross-shelf transects of water depth h(x) within the MAB (Table 2 ). Transports are similar to the previous estimates cited above. The transport estimates decrease from Cape Cod to North Carolina because of the corresponding reduction in cross-sectional area, since da does not vary along isobaths. The alongshelf transport divergence is roughly constant and corresponds to a depth-averaged offshore flow of slightly less than 1 cm s \u03ea1 at the shelf break.\nThe transport estimates from (18) are not the entire shelf-water transport. The shelf-slope front separating the shelf water from the slope water is typically near the shelf break at the bottom but extends farther offshore near the surface (Linder and Gawarkiewicz 1998) . Since the transport estimates from (18) are for the area onshore of the shelf break (a particular isobath), they do not include the shelf-water transport offshore of the shelf break. Therefore the total shelf-water transport is almost certainly larger and may be conserved alongshelf if the shelf-water transport offshore of the shelf break increases from north to south. The cross-shelf density gradient is primarily due to the cross-shelf salinity variation from relatively salty open-ocean water to fresher water near the coast resulting from the tendency for freshwater discharge from rivers and estuaries to turn and flow alongshelf as narrow buoyant gravity currents. The mean cross-shelf distribution of density presumably depends on vertical and lateral mixing, but the specific process(es) are not clear. 
Wind forcing, tidal mixing, and instabilities are likely processes. A model proposed by Chapman and Lentz (2005) is based on the idea that the basic mechanism for establishing the cross-shelf density gradient is buoyancy advection and vertical mixing within the bottom boundary layer. The Chapman and Lentz model solves the same momentum equations as in section 3, but includes a density equation and hence solves for the cross-shelf density structure assuming an initial stratification. The bottom boundary layer is assumed to be vertically mixed in density. Lateral density gradients form because of the sloping bottom, the interior stratification, and cross-isobath advection in the bottom boundary layer. Consequently, there is a vertically sheared alongshelf flow in thermal wind balance with the crossisobath density gradients in the bottom boundary layer that reduces the near-bottom flow and hence the bottom stress, that is, buoyancy shutdown in the bottom boundary layer. The model is not valid over the inner shelf. Assuming the same shelf parameters and forcing as before (except for the cross-shelf density gradient), the Chapman and Lentz (2005) model yields a reasonable cross-shelf density gradient and reproduces the observed da (dash-dot line, Fig. 3) . However, the steadystate density field in the model is vertically uniform, which is inconsistent with the observations, possibly because the model does not include the annual variation in surface heating or river discharges. A number of previous studies have made estimates of the alongshelf volume transport in the MAB (Beardsley et al. 1976; Ramp et al. 1988; Biscaye et al. 1994; Loder et al. 1998; Churchill and Berger 1998; Savidge and Bane 2001; Flagg and Dunn 2003) . The observed relationship between da and h (Fig. 3) provides a potentially more robust volume transport estimate since the relationship is based on a larger number of currentmeter records. 
Volume transports are estimated as\nwhere x sb is the offshore location of the shelfbreak and da \u03ed \u03ea1.8 \u03eb 10 \u03ea2 \u03ea 7.0 \u03eb 10 Analysis of moored current observations from the Middle Atlantic Bight shelf reveal a consistent mean circulation pattern. Time series longer than 200 days were used in the analysis so that the standard error of the mean flow is 1 cm s \u03ea1 or less. A coordinate system is used with the alongshelf direction aligned with the mean depth-averaged flow, which tends to be roughly along-isobath.\nThe mean depth-averaged flow is equatorward, alongshelf, and increases linearly with increasing water depth from 3 cm s \u03ea1 at the 15-m isobath to 10 cm s \u03ea1 at the 100-m isobath (Fig. 3) . The mean alongshelf flow increases with height above the bottom (Fig. 4) with a maximum vertical shear of 2-4 \u03eb 10 \u03ea2 s \u03ea1 near the bottom decreasing to approximately zero more than 50 m above the bottom (Fig. 5) .\nThe depth-averaged cross-shelf flow is zero by definition in the coordinate system used. The mean crossshelf circulation is weak, but exhibits a consistent crossshelf and vertical structure (Fig. 2) . The near-surface flow is typically offshore but variable ranging from \u03ea2 to 4 cm s  The alongshelf velocity is estimated here assuming the cross-shelf momentum balance includes the stress divergence term rather than assuming a geostrophic balance as in section 3c. The geostrophic component is given by (10). However, the alongshelf velocity at the bottom bot is bot \u03ed be \u03e9 gb , \u0351A4\u0352\nwhere gb is the geostrophic velocity at the bottom. Equating (11) and (A4), using (A2), and solving for \/h \u03fd 1, the terms containing \u2425 2 are small. The cross-shelf wind stress term is also small except over the inner shelf, since \u2425 \u03f7 0.18, \u2426 b \/h \u03fd 1, and sx is comparable to sy (see also Fewings et al. 2008) . 
Neglecting the terms involving \u2425 2 and the cross-shelf wind stress term, (A6) reduces to (12).  To estimate bottom stress at each of the MAB sites, a quadratic drag coefficient C D as a function of height above the bottom z\u0408 was determined using near-bottom covariance stress estimates from the southern flank of Georges Bank [J. Trowbridge 2006 , personal communication; see also Werner et al. (2003) for details of the measurements]. The basic approach for making the covariance stress estimates, including removal of wave biases, is outlined in Shaw et al. (2001) and Trowbridge (1998) . The drag coefficient was estimated as the slope of a linear regression of the form\nwhere | u\u0408w\u0408| is the magnitude of the covariance stress estimate and b is an intercept. Comparison with C D estimates from a midshelf (Shaw et al. 2001 ) and an inner-shelf site (J. Trowbridge 2006, personal communication) , both south of Cape Cod, Massachusetts, indicate about a factor-of-2 variation in the near-bottom C D profiles from the three sites (Fig. B1) . The New England midshelf site was located in a region of silt (called the \"mud patch\"), and C D is relatively small because of near-bottom stratification by suspended sediment (J. Trowbridge 2006, personal communication) . Time series of bottom stress b were estimated for each of the 29 sites where hourly time series were available. The drag coefficient C D was estimated by interpolating the Georges Bank C D profile to the height of the current meter nearest the bottom and then bottom stress was estimated using a quadratic drag law provides an accurate estimate of the mean bottom stress (Fig. B2) . The correlation is 0.90 for the crossshelf component and 0.84 for the alongshelf component, both significant at the 95% confidence level. The estimated linear drag coefficient is r \u03ed 2.5 \u03eb 10 \u03ea4 m s",

)

# Main training loop: every epoch optionally checkpoints, prints a sample
# translation as a qualitative progress check, then runs one pass over
# train_iterator.
for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    # The snapshot is taken before this epoch's updates, so it reflects the
    # weights/optimizer state produced by the previous epochs.
    if save_model:
        save_checkpoint(
            dict(state_dict=model.state_dict(), optimizer=optimizer.state_dict())
        )

    # Switch to eval mode for the sample translation, then back to train mode
    # before any parameter updates happen.
    model.eval()
    translated_sentence = translate_sentence(
        model, sentence, context, answer, device, max_length=50
    )
    print(f"Translated example sentence: \n {translated_sentence}")
    model.train()

    for batch in train_iterator:
        # "c" (context) is the model input and "a" (answer) is the target;
        # both are moved to the training device.
        inp_data = batch.c.to(device)
        target = batch.a.to(device)

        # Forward pass; the target is handed to the model as well.
        output = model(inp_data, target)

        # Per the original author's note, output is (trg_len, batch_size,
        # output_dim), which the loss does not accept directly. Drop the first
        # time step (the start-token position) and flatten so predictions are
        # (trg_len - 1) * batch_size rows of output_dim scores, with the
        # targets flattened to match - analogous to (N, 10) vs (N) for MNIST.
        vocab_dim = output.shape[2]
        output = output[1:].reshape(-1, vocab_dim)
        target = target[1:].reshape(-1)

        loss = criterion(output, target)

        # Clear stale gradients, then back-propagate the new loss.
        optimizer.zero_grad()
        loss.backward()

        # Clip the global gradient norm to 1 to guard against exploding
        # gradients.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Apply the parameter update.
        optimizer.step()

        # Log the per-batch loss to tensorboard under a running step counter.
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1

# Final evaluation: BLEU on a slice of the test set, since running on the
# entire test data takes a while.
# The training loop above leaves the model in train() mode, so switch to eval
# mode first; otherwise the dropout layers stay active while scoring (unless
# bleu() changes the mode itself) and the reported score is noisy.
model.eval()
# NOTE(review): the slice starts at index 1, so test example 0 is never
# scored - confirm whether test_data[:100] was intended.
score = bleu(test_data[1:100], model, context, answer, device)
print(f"Bleu score {score * 100:.2f}")
